## Header
# visualize microsat data 2016-2020 quarantine animals
# last updated 2-1-2022


## prep workspace
  rm(list=ls())
  require(adegenet)
  require(pegas)
  require(hierfstat)
  require(RColorBrewer)
  require(reshape2)
  cur.dir=("Q:\\NEW O DRIVE\\Population Conservation\\Genetics\\population structure\\")
  setwd(cur.dir)
  rm(cur.dir)
  
## read in tabular genotype data and convert to genid object
  d=read.table("data files\\tabular format subset to 190 samples.txt",colClasses = "character",header=TRUE)
  halbertloci=c("AGLA232",
                "BL1036",
                "BM1225","BM1706","BM17132","BM1824","BM1862","BM188","BM1905","BM2113","BM2830","BM4028","BM4107","BM4311","BM4440","BM47","BM711","BM720","BM757",
                "BMC4214",
                "BMS1001","BMS1074","BMS1117","BMS1172","BMS1315","BMS1355","BMS1675","BMS1716","BMS1747","BMS1857","BMS2258","BMS2639","BMS410","BMS510","BMS527","BMS528","BMS812","BMS941",
                "HUJ246",
                "IL4",
                "ILSTS102",
                "INRA037","INRA194",
                "TGLA122","TGLA44","TGLA53")
  pop=rep("pop",nrow(d))
  loci=d[,c(9:ncol(d))]
  loci.halbert=d[,colnames(d)%in%halbertloci]
  df=df2genind(X=loci,sep="/",ind.names=d$ID,ploidy = 2,NA.char="000",pop=pop) #create genid object
  dfsub=df2genind(X=loci.halbert,sep="/",ind.names=d$ID,ploidy = 2,NA.char="000",pop=pop) #create genid object for just Halbert microsats
  rm(pop)

## filter missing data
  missingloc=propTyped(df,by="ind") #identify missing data from each individual
  removeind=names(missingloc[which(missingloc<1)])
  df=df[!row.names(df@tab) %in% removeind] #remove individual with missing data
  rm(missingloc,removeind)

  
## summarize data
    b=basic.stats(df, diploid = TRUE)
    smry=summary(df)

    #Hardy-Weinberg
      barplot(smry$Hexp-smry$Hobs, main="Heterozygosity: expected-observed",ylab="Hexp - Hobs")
      mean(smry$Hobs); sd(smry$Hobs); min(smry$Hobs); max(smry$Hobs)
      mean(smry$Hexp); sd(smry$Hexp); min(smry$Hexp); max(smry$Hexp)
      bartlett.test(list(smry$Hexp,smry$Hobs)) #tst of variance
      t.test(smry$Hexp,smry$Hobs,pair=T,var.equal=TRUE,alter="greater") #test of means
      hwt=hw.test(df, B=0)#hwt test per loci
        hwt[which(hwt[,3]<0.05),]#report loci that failed H-W-E
    
    #allelic richness
      barplot(smry$loc.n.all, ylab="Number of alleles",main="Number of alleles per locus")
      ar=allelic.richness(df,min.n=NULL,diploid=TRUE)
          mean(ar$Ar[,1]);sd(ar$Ar[,1])
 
    # Inbreeding coefficeint
      mean(b$Fis[,1]); sd(b$Fis[,1])
  
    # Structure
      #pca
      pca1=dudi.pca(tab(df),scannf=FALSE, scale=FALSE, nf=3)
      percents = pca1$eig/sum(pca1$eig)*100
      barplot(percents, ylab = "Genetic variance explained by eigenvectors (%)",names.arg = round(percents, 1))
      pal=brewer.pal(8,"Accent")
      s.class(pca1$li, pop(df),label=NA)
      title("PCA of Bison\naxes 1-2")
      add.scatter.eig(pca1$eig[1:20], 3,1,2)
      #k-means cluster
      grp=find.clusters(df, max.n.clust=5) #choose pcas and clusters interactively
   
    rm(hwt,ar,pca1,percents,pal,grp)

    
    
    
## Compare to Halbert 2012
    
    dat.sum=read.csv("Q:\\NEW O DRIVE\\Population Conservation\\Genetics\\population structure\\data files\\MicroSat_Summary_Halbert_2012.csv",header=TRUE)
      names(dat.sum)=c("loci","n.alleles","AR","Ho","Hs")
      #remove inconsistent loci (44 good loci)
      missingloci=c("BM188","BM1905")
      dat.sum=dat.sum[!dat.sum$loci%in%missingloci,]
      dat.sum=na.omit(dat.sum)
  
    #look at heterozygosity, allelic richness, inbreeding coefficient
    dat.sum=data.frame(dat.sum,
                         "currentHo"=basic.stats(dfsub, diploid = TRUE)$perloc[,1],
                         "currentHs"=basic.stats(dfsub, diploid = TRUE)$perloc[,2],
                         "currentAR"=allelic.richness(dfsub,min.n=NULL,diploid=TRUE)$Ar[,1])
    dat.sum$diffHS=dat.sum$currentHs-dat.sum$Hs
    dat.sum$diffAr=dat.sum$currentAR-dat.sum$AR
    basic.stats(dfsub, diploid = TRUE)$overall
    data.frame("Hs"=basic.stats(dfsub, diploid = TRUE)$Hs[,1],
               "Fis"=basic.stats(dfsub, diploid = TRUE)$Fis[,1],
               "Ar"=allelic.richness(dfsub,min.n=NULL,diploid=TRUE)$Ar[,1])
    mean(allelic.richness(dfsub,min.n=NULL,diploid=TRUE)$Ar[,1])
    mean(basic.stats(dfsub, diploid = TRUE)$Hs[,1])
    mean(basic.stats(dfsub, diploid = TRUE)$Fis[,1])
    
    
    
    dat.allele=read.csv("Q:\\NEW O DRIVE\\Population Conservation\\Genetics\\population structure\\data files\\MicroSat_AlleleFreq_Halbert_2012.csv",header=TRUE)
      dat.allele=dat.allele[,c(1,2,5)]  
      names(dat.allele)=c("loci","allele","freq")
      #remove inconsistent loci (44 good loci)
      missingloci=c("BM188","BM1905")
      dat.allele=dat.allele[!dat.allele$loci%in%missingloci,]
      dat.allele=na.omit(dat.allele)
    
    #prep data to calculate allele frequencies
    dat=data.frame("ID"=d$ID,loci.halbert)
      dat<- melt(dat, id.vars = c("ID"))
      x=unlist(strsplit(dat$value,"/"))
      dat$A1=x[seq(1,length(x),2)]
      dat$A2=x[seq(2,length(x),2)]
      dat=melt(dat,id.vars=c("ID","variable"),measure.vars = c("A1","A2"))
      dat=dat[,-3]
      dat$value=as.numeric(dat$value)
    #get allele frequencies
      dat=do.call(rbind,lapply(1:length(unique(dat$variable)),function(i){
        temp=subset(dat,dat$variable==unique(dat$variable)[i])
        tab=table(temp$value)
        out=data.frame(unique(dat$variable)[i],tab/sum(tab))
        names(out)=c("loci","allele","freq2")
        out$allele=as.numeric(as.character(out$alle))
        return(out)}))
      x=merge(dat.allele, dat,all.x=TRUE,all.y=TRUE)
      x$freq[is.na(x$freq)]=0
      x$freq2[is.na(x$freq2)]=0
      
      